# Notebook setup: make Jupyter echo EVERY bare expression in a cell, not just
# the last one (the bare `.unique()` / `.feature_importances_` lines below
# rely on this to display their results).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
#Importing the libraries needed for entire project
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import sklearn
import mglearn
import warnings
import calendar
# Silence library warnings so notebook output stays readable.
warnings.filterwarnings("ignore")
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import pycats
from sklearn.metrics import roc_auc_score, roc_curve
import statsmodels.api as sm
import itertools
# Apply seaborn's default plot styling globally.
sns.set()
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline
# Load the pre-processed crime data set produced by earlier cleaning steps.
# NOTE(review): assumes 'final_crimes.csv' sits in the working directory and
# already contains the engineered columns used below ('Type_factor', 'Hour',
# 'Month', ...) — confirm against the preprocessing notebook.
crimes = pd.read_csv ('final_crimes.csv')
# referenced Dr. Yingjie Zhang code
# Bare expression: displayed in the notebook (ast_node_interactivity = "all")
# to sanity-check the distinct factorized crime-type codes.
crimes['Type_factor'].unique()
# Integer-encode the free-text 'Location Description' column so it can be fed
# to the tree models; pd.factorize returns (codes, uniques) — keep the codes.
crimes['Location Description'] = pd.factorize(crimes['Location Description'])[0]
# model 1: predict the collapsed crime type ('Type_factor') from
# location/time features plus the arrest flag.
target_pt = 'Type_factor'
features_pt = ['X Coordinate','Y Coordinate', 'Hour','Month','Arrest', 'Community Area', 'Location Description']
# Hold out 30% of the rows for testing; random_state pins the shuffle.
train_df, test_df = train_test_split(crimes,
                                     test_size=0.3,
                                     train_size=0.7,
                                     random_state=0)
X_train_pt = train_df[features_pt]
y_train_pt = train_df[target_pt]
X_test_pt = test_df[features_pt]
y_test_pt = test_df[target_pt]
print('Feature Set Used : ', features_pt)
print('Target Class : ', target_pt)
print('Training Set Size : ', train_df.shape)
print('Test Set Size : ', test_df.shape)
# Gini-impurity tree grown to full depth; min_samples_leaf=50 is the only
# pruning applied.
tree1_pt = DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  max_depth=None,
                                  max_leaf_nodes=None,
                                  min_samples_leaf=50,
                                  class_weight=None,
                                  random_state=0)
tree1_pt.fit(X_train_pt, y_train_pt)
acc_train = tree1_pt.score(X_train_pt, y_train_pt)
acc_test = tree1_pt.score(X_test_pt, y_test_pt)
print("Accuracy on training set: {:.3f}".format(acc_train))
print("Accuracy on test set: {:.3f}".format(acc_test))
# Can accuracy be better? Maybe take out variables
# model 2: same tree settings, but predicting the raw 'Primary Type' labels
# and with 'Arrest' removed from the feature set.
target_pt = 'Primary Type'
features_pt = ['X Coordinate','Y Coordinate', 'Hour','Month', 'Community Area', 'Location Description']
train_df, test_df = train_test_split(crimes,
                                     test_size=0.3,
                                     train_size=0.7,
                                     random_state=0)
X_train_pt = train_df[features_pt]
y_train_pt = train_df[target_pt]
X_test_pt = test_df[features_pt]
y_test_pt = test_df[target_pt]
print('Feature Set Used : ', features_pt)
print('Target Class : ', target_pt)
print('Training Set Size : ', train_df.shape)
print('Test Set Size : ', test_df.shape)
tree2_pt = DecisionTreeClassifier(criterion='gini',
                                  splitter='best',
                                  max_depth=None,
                                  max_leaf_nodes=None,
                                  min_samples_leaf=50,
                                  class_weight=None,
                                  random_state=0)
tree2_pt.fit(X_train_pt, y_train_pt)
acc_train = tree2_pt.score(X_train_pt, y_train_pt)
acc_test = tree2_pt.score(X_test_pt, y_test_pt)
print("Accuracy on training set: {:.3f}".format(acc_train))
print("Accuracy on test set: {:.3f}".format(acc_test))
#Worse accuracy - do feature test
# Feature-importance probe: fit an unpruned tree on a wide feature set and
# display feature_importances_ (aligned positionally with features_pt) to
# decide which columns to keep for the final model.
target_pt = 'Type_factor'
features_pt = ['X Coordinate','Y Coordinate', 'Year', 'Hour','Month', 'Arrest','Ward',
               'District','Beat','Domestic','Community Area', 'Location Description']
#Split dataset to Training Set & Test Set
train_df, test_df = train_test_split(crimes,
                                     test_size=0.3,
                                     train_size=0.7,
                                     random_state=0)
X_train_pt = train_df[features_pt]
y_train_pt = train_df[target_pt]
X_test_pt = test_df[features_pt]
y_test_pt = test_df[target_pt]
# BUG FIX: the classifier was instantiated twice and the first object was
# silently discarded — build it once.
tree_many_pt = DecisionTreeClassifier()
tree_many_pt.fit(X_train_pt, y_train_pt)
# Bare expression: displayed by the notebook's "all" interactivity setting.
tree_many_pt.feature_importances_
#tree after running feature test
# Final crime-type model: keep the columns the importance probe found useful
# ('District' dropped relative to the probe's feature list).
target_pt = 'Type_factor'
features_pt = ['X Coordinate','Y Coordinate', 'Year', 'Hour','Month',
               'Arrest','Ward','Beat','Domestic', 'Community Area', 'Location Description']
train_df, test_df = train_test_split(crimes,
                                     test_size=0.3,
                                     train_size=0.7,
                                     random_state=0)
X_train_pt = train_df[features_pt]
y_train_pt = train_df[target_pt]
X_test_pt = test_df[features_pt]
y_test_pt = test_df[target_pt]
print('Feature Set Used : ', features_pt)
print('Target Class : ', target_pt)
print('Training Set Size : ', train_df.shape)
print('Test Set Size : ', test_df.shape)
# Same leaf-size pruning as the earlier models, plus a depth cap of 100.
tree_final_pt = DecisionTreeClassifier(criterion='gini',
                                       splitter='best',
                                       max_depth=100,
                                       max_leaf_nodes=None,
                                       min_samples_leaf=50,
                                       class_weight=None,
                                       random_state=0)
tree_final_pt.fit(X_train_pt, y_train_pt)
acc_train = tree_final_pt.score(X_train_pt, y_train_pt)
acc_test = tree_final_pt.score(X_test_pt, y_test_pt)
print("Accuracy on training set: {:.3f}".format(acc_train))
print("Accuracy on test set: {:.3f}".format(acc_test))
# Best accuracy found in this round of manual tuning.
# Ran before collapsing Primary Type and Location Description — results are
# MUCH BETTER using the collapsed variables.
# Referenced to plot tree - https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176
from sklearn.tree import export_graphviz
print(features_pt)
# Write the fitted final tree (truncated to depth 10 for legibility) out as
# Graphviz source.
export_graphviz(tree_final_pt,
                feature_names=features_pt,
                out_file="tree_pt.dot",
                max_depth=10)
import graphviz
from IPython.display import display
# Read the .dot source back in and render it inline in the notebook.
with open("tree_pt.dot") as dot_file:
    dot_source = dot_file.read()
display(graphviz.Source(dot_source))
# Model: predict whether a reported crime led to an arrest.
target_arrest = 'Arrest'
features_arrest = ['Beat', 'District', 'Ward', 'Community Area', 'Year', 'Month', 'Location Description', 'Type_factor']
#Split dataset to Training Set & Test Set
train_df, test_df = train_test_split(crimes,
                                     test_size=0.3,
                                     train_size=0.7,
                                     random_state=0)
X_train_arrest = train_df[features_arrest]
y_train_arrest = train_df[target_arrest]
X_test_arrest = test_df[features_arrest]
y_test_arrest = test_df[target_arrest]
print('Feature Set Used : ', features_arrest)
print('Target Class : ', target_arrest)
print('Training Set Size : ', train_df.shape)
print('Test Set Size : ', test_df.shape)
#feature test
# CONSISTENCY/REPRODUCIBILITY FIX: every other tree in this file pins
# random_state=0; the original left it unset here, so tie-breaks between
# equally good splits could vary between runs.
tree_arrest = DecisionTreeClassifier(random_state=0)
# BUG FIX: the tree was fitted twice on identical data (redundant refit
# before the accuracy prints) — fit once and reuse.
tree_arrest.fit(X_train_arrest, y_train_arrest)
# Bare expression: importances displayed by the notebook, aligned with
# features_arrest.
tree_arrest.feature_importances_
# Accuracy?
print("Accuracy on training set: {:.3f}".format(tree_arrest.score(X_train_arrest, y_train_arrest)))
print("Accuracy on test set: {:.3f}".format(tree_arrest.score(X_test_arrest, y_test_arrest)))
#Confusion Matrix
# Evaluate the arrest classifier on the held-out set: confusion matrix,
# precision/recall/accuracy/F1, then the ROC curve.
y_pred_dt = tree_arrest.predict(X_test_arrest)
print(confusion_matrix(y_test_arrest, y_pred_dt))
print('Precision score: {:.4f}'.format(precision_score(y_test_arrest,y_pred_dt)))
print('Recall score: {:.4f}'.format(recall_score(y_test_arrest,y_pred_dt)))
print('Accuracy score: {:.4f}'.format(accuracy_score(y_test_arrest,y_pred_dt)))
print('F1 score: {:.4f}'.format(f1_score(y_test_arrest,y_pred_dt)))
#ROC curve
# FIX: predict_proba and roc_auc_score were each computed twice with
# identical inputs — compute the positive-class probabilities and the AUC
# once and reuse them for the print, the curve, and the plot label.
y_pred_proba_dt = tree_arrest.predict_proba(X_test_arrest)[:, 1]
dt_auc = roc_auc_score(y_test_arrest, y_pred_proba_dt)
print("AUC for Decision tree: {:.3f}".format(dt_auc))
fpr, tpr, threshold = roc_curve(y_test_arrest, y_pred_proba_dt)
plt.plot(fpr,tpr,label="auc="+str(dt_auc))
plt.legend(loc=4)
plt.ylabel('Recall')
plt.xlabel('1-specificity')
plt.title('ROC Curve')
plt.show()
# Referenced to plot tree - https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176
from sklearn.tree import export_graphviz
# Write the fitted arrest tree (truncated to depth 10 for legibility) out as
# Graphviz source.
export_graphviz(tree_arrest,
                feature_names=features_arrest,
                out_file="tree_arrest.dot",
                max_depth=10)
import graphviz
from IPython.display import display
# Read the .dot source back in and render it inline in the notebook.
with open("tree_arrest.dot") as dot_file:
    dot_source = dot_file.read()
display(graphviz.Source(dot_source))